{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "866c2275",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import our programs\n",
    "import requests\n",
    "import re\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "from urllib import request,response\n",
    "from pathlib import Path\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9dbbef39",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the source CSV, keeping every column as object dtype\n",
    "df = pd.read_csv(\n",
    "    '/Users/alenasmith/Library/CloudStorage/GoogleDrive-asvinod@stanford.edu/Shared drives/Johnson Congress Project/Replication Files/Dictionary/Dictionary Data/dcinbox.csv',\n",
    "    dtype=object,\n",
    "    encoding='latin-1',\n",
    ")\n",
    "\n",
    "\n",
    "# Normalise the text column in one chain:\n",
    "#   1. cast to str and lowercase\n",
    "#   2. turn every newline into a space\n",
    "#   3. squeeze runs of spaces down to a single space\n",
    "df['Body'] = (\n",
    "    df['Body']\n",
    "    .astype(str)\n",
    "    .str.lower()\n",
    "    .str.replace(r'\\n', ' ', regex=True)\n",
    "    .str.replace(' +', ' ', regex=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b36c9592",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dictionaries of lowercase stems/phrases to search for, one list per category\n",
    "god_terms = [\n",
    "    'god', 'jesus', 'christ',\n",
    "    'savior', 'saviour', 'our lord', 'the lord',\n",
    "    'holy spirit',\n",
    "]\n",
    "blessings_terms = [\n",
    "    'bless', 'faith', 'pray', 'divine', 'salvation', 'worship',\n",
    "    ' sin ', ' sins ', 'sinful', 'sinner', 'original sin',\n",
    "    'heaven', 'holy ', 'spiritual',\n",
    "]\n",
    "scripture_terms = [\n",
    "    'scripture', 'bible', 'biblical', 'psalm', 'ten commandments',\n",
    "]\n",
    "religion_terms = [\n",
    "    'religio', 'christian', 'gospel',\n",
    "]\n",
    "political_terms = [\n",
    "    'judeo',\n",
    "    'religious freedom', 'religious libert', 'freedom of religion',\n",
    "    'christian nation',\n",
    "]\n",
    "# Union of the god, scripture, blessings and religion lists\n",
    "all_terms = [\n",
    "    'god', 'jesus', 'christ',\n",
    "    'savior', 'saviour', 'our lord', 'the lord',\n",
    "    'holy spirit',\n",
    "    'scripture', 'bible', 'biblical', 'psalm', 'ten commandments',\n",
    "    'bless', 'pray', 'faith', 'divine', 'salvation', 'worship',\n",
    "    ' sin ', ' sins ', 'sinful', 'sinner', 'original sin',\n",
    "    'heaven', 'holy ', 'spiritual',\n",
    "    'religio', 'christian', 'gospel',\n",
    "]\n",
    "whistles_terms = [\n",
    "    'pornography',\n",
    "    'gender ideology', 'gender agenda',\n",
    "    'pro-family', 'traditional famil', 'traditional marriage',\n",
    "    'unborn', 'pro-life', 'protect life', 'protecting life', 'abortion agenda',\n",
    "]\n",
    "\n",
    "\n",
    "# Start each per-category count column at zero\n",
    "for count_column in ['god_terms_count', 'blessings_terms_count', 'scripture_terms_count',\n",
    "                     'religion_terms_count', 'political_terms_count', 'all_terms_count',\n",
    "                     'whistles_terms_count']:\n",
    "    df[count_column] = 0\n",
    "\n",
    "\n",
    "# Function to build regex pattern for exceptions for each word\n",
    "def build_pattern(word):\n",
    "    \"\"\"Return the regex source used to count one dictionary term.\n",
    "\n",
    "    Terms listed in ``guards`` have known false positives. Their pattern starts\n",
    "    at a word boundary, applies negative lookaheads that veto the listed words\n",
    "    and phrases, and then lets the term sit anywhere inside the word. Every\n",
    "    other term must start at a word boundary and may be followed by a suffix.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    word : str\n",
    "        Term or phrase to search for; it is passed through re.escape.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        Regex pattern string.\n",
    "    \"\"\"\n",
    "    # Negative lookaheads, evaluated at the start of the candidate word\n",
    "    guards = {\n",
    "        'god': r'(?!god forbid\\b)(?!goddard\\b)',\n",
    "        'christ': (\n",
    "            r'(?!\\w*christmas\\b)(?!christopher\\b)(?!christian\\b)(?!christians\\b)'\n",
    "            r'(?!christianity\\b)(?!christmases\\b)(?!christa\\b)(?!christi\\b)'\n",
    "            r'(?!christie\\b)(?!christine\\b)(?!christina\\b)(?!christy\\b)'\n",
    "            r'(?!christmastime\\b)(?!christon\\b)(?!gilchrist\\b)(?!christiansted\\b)'\n",
    "            r'(?!christensen\\b)(?!christen\\b)(?!christening\\b)'\n",
    "        ),\n",
    "        # The lookahead runs where the match would begin, i.e. at 'the', so the\n",
    "        # excluded phrase has to start with 'the' as well; the earlier\n",
    "        # 'lord lieutenant' guard could never fire.\n",
    "        'the lord': r'(?!the lords and ladies\\b)(?!the lord lieutenant\\b)',\n",
    "        'faith': (\n",
    "            r'(?!faith and credit\\b)(?!faithfully execute\\b)'\n",
    "            r'(?!faithfully executed\\b)(?!faithfully to execute\\b)'\n",
    "        ),\n",
    "        # No trailing word boundary after 'spray', so 'sprayed', 'spraying',\n",
    "        # 'sprayer' etc. are rejected along with the bare word.\n",
    "        'pray': r'(?!prayer and pledge\\b)(?!prayer and the pledge\\b)(?!spray)',\n",
    "        'salvation': r'(?!salvation army\\b)',\n",
    "        # NOTE: the term lists in this notebook use 'holy ' (trailing space),\n",
    "        # which does not hit this key; it only applies to the bare word 'holy'.\n",
    "        'holy': r'(?!\\w*holyoke\\b)',\n",
    "        'gospel': r'(?!gospel music\\b)(?!gospel lyres\\b)',\n",
    "        'heaven': r'(?!heaven forbid\\b)',\n",
    "        'protect life': r'(?!protect life and property\\b)',\n",
    "        'protecting life': r'(?!protecting life and property\\b)',\n",
    "    }\n",
    "\n",
    "    escaped = re.escape(word)\n",
    "    if word in guards:\n",
    "        # Guarded terms may also carry a prefix inside the same word\n",
    "        return r'\\b' + guards[word] + r'\\w*' + escaped + r'\\w*\\b'\n",
    "    # Default: term at the start of a word, optional suffix\n",
    "    return r'\\b' + escaped + r'\\w*\\b'\n",
    "\n",
    "\n",
    "# Iterate over each list of words and count their occurrences\n",
    "\n",
    "# Lowercase the text once up front instead of once per term inside the loop\n",
    "body_lower = df['Body'].str.lower()\n",
    "\n",
    "for terms, column in [(god_terms, 'god_terms_count'), (blessings_terms, 'blessings_terms_count'),\n",
    "                      (scripture_terms, 'scripture_terms_count'), (religion_terms, 'religion_terms_count'),\n",
    "                      (political_terms, 'political_terms_count'), (all_terms, 'all_terms_count'),\n",
    "                      (whistles_terms, 'whistles_terms_count')]:\n",
    "    for word in terms:\n",
    "        # Regex for this term, including any exclusions defined in build_pattern\n",
    "        pattern = build_pattern(word)\n",
    "\n",
    "        # Add this term's per-row match count to the category total\n",
    "        df[column] += body_lower.str.count(pattern, flags=re.DOTALL)\n",
    "\n",
    "        \n",
    "\n",
    "# Derive a 0/1 indicator per category: 1 when the category count is positive\n",
    "for category in ['god', 'blessings', 'scripture', 'religion', 'political', 'all', 'whistles']:\n",
    "    has_match = df[category + '_terms_count'] > 0\n",
    "    df[category + '_terms_binary'] = has_match.astype(int)\n",
    "\n",
    "\n",
    "# Write the final table to disk without the index (NOTE the further instructions below)\n",
    "\n",
    "df.to_csv('DCinbox_dict.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7bc6e6a3",
   "metadata": {},
   "source": [
    "Minor edits to do after exporting data:\n",
    "- Remove the matches on \"anti-abortion agenda\", which the 'abortion agenda' term picks up (3 instances - rows 4698, 5993, 13569). In these rows, change whistles_terms_count (currently 1 or 2) and whistles_terms_binary to 0. This will make the entire row be 0's for all these rows.\n",
    "- Add the \" sin\" / \" sins\" occurrences that are followed by punctuation, which the space-delimited terms ' sin ' and ' sins ' do not match: \" sin.\" (row 1127); \" sin,\" (row 16248); \" sins.\" (rows 143, 10638, 10650 (increase by 2), 13997 (increase by 2), 16396, 16600, 16601); \" sins,\" (rows 1334, 10553, 10554, 10650 (increase by 2), 14041, 16248). For each listed row, increase blessings_terms_count and all_terms_count by 1 (or by 2 where noted), and if blessings_terms_binary or all_terms_binary is 0, change it to 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "242792d1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
